3.0-Normalization and Embedding¶

Inés Sentís

Date of execution

In [1]:
Sys.Date()
2025-01-09

Introduction¶

Normalize data and create embeddings for the each time point fraction

In [2]:
sample <- "41BBctr"

Libraries¶

In [3]:
suppressMessages(suppressWarnings({
library(Seurat)
library(tidyverse)
library(grid)
library(gridExtra)
library(ggplot2)
library(scater) 
library(scran)
}))

Parameters¶

In [4]:
#here::dr_here(show_reason = TRUE)
source(here::here("SCGRES_124_125/sc_analysis/misc/paths.R"))
source(here::here("SCGRES_124_125/sc_analysis/misc/variables.R"))

"{clust}/{plt_dir}" %>%
  glue::glue() %>%
  here::here() %>%
  dir.create(path = .,
           showWarnings = FALSE,
           recursive = TRUE)

"{clust}/{robj_dir}" %>%
  glue::glue() %>%
  here::here() %>%
  dir.create(path = .,
           showWarnings = FALSE,
           recursive = TRUE)

set.seed(0)

Load data¶

In [5]:
seurat_obj <- readRDS(here::here(glue::glue("{qc}/{robj_dir}/clean_combined_object_{sample}.rds")))
In [6]:
var_name <- paste0("comp_", sample)
dcomp <- get(var_name)

Normalization and linear dimensional reduction¶

In [7]:
seurat_obj <- NormalizeData(
  seurat_obj,
  normalization.method = "LogNormalize",
  scale.factor = 1e4
)
In [8]:
sce <- as.SingleCellExperiment(seurat_obj)
sce
class: SingleCellExperiment 
dim: 25804 9024 
metadata(0):
assays(2): counts logcounts
rownames(25804): AL627309.1 AL627309.5 ... AC233755.2 AC007325.4
rowData names(0):
colnames(9024): AAACCTGAGAACTGTA-1 AAACCTGAGAATGTTG-1 ...
  TTTGTCATCGTATCAG-1 TTTGTCATCTTGTCAT-1
colData names(12): orig.ident nCount_RNA ... doublet_pred ident
reducedDimNames(0):
mainExpName: RNA
altExpNames(0):
In [9]:
gene_var <- modelGeneVar(sce)

tops <- gene_var %>% 
    as.data.frame() %>% 
    arrange(desc(total)) %>% 
    head(n=20)

gene_var %>% 
  as.data.frame() %>% 
  ggplot(aes(mean, total)) +
  geom_point() +
  geom_line(aes(y = tech), colour = "dodgerblue", size = 1) +
  labs(x = "Mean of log-expression", y = "Variance of log-expression")+
  geom_text(data=tops, aes(mean,total,label=rownames(tops)))
Warning message:
“Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.”
In [10]:
gene_var %>% 
    as.data.frame() %>% 
    arrange(desc(total)) %>% 
    head(n=20)
A data.frame: 20 × 6
meantotaltechbiop.valueFDR
<dbl><dbl><dbl><dbl><dbl><dbl>
GNLY1.05473962.96137470.41966202.54171271.141553e-2291.453025e-225
CCL53.48583852.46309050.15405162.3090389 0.000000e+00 0.000000e+00
AL138963.41.86666071.20851010.32305860.8854515 8.688159e-49 1.053212e-45
IL7R1.97972601.18008810.30495240.8751357 2.772235e-53 4.704852e-50
LGALS11.42994641.17695850.39091820.7860403 3.488036e-27 1.675376e-24
GZMA1.96415421.13737790.30744050.8299374 2.173738e-47 2.405949e-44
NKG70.97827121.11632000.41851230.6978078 2.752163e-19 8.757728e-17
SELL0.91694721.02819260.41529710.6128956 1.650685e-15 4.320243e-13
ZNF6830.65115980.98537040.37230360.6130668 7.394079e-19 2.267844e-16
HIST1H1E1.26205290.97675060.41012960.5666210 8.164623e-14 1.960819e-11
HOPX1.18267660.91552860.41572800.4998006 6.884647e-11 1.436578e-08
CD742.02855850.84786920.29730720.5505619 2.382528e-23 9.052539e-21
TUBA1B0.64553320.84268590.37080490.4718810 5.451945e-12 1.239198e-09
HIST1H4C1.47702330.82585660.38422410.4416324 4.219307e-10 8.262377e-08
CST70.79456910.82580290.40123910.4245638 8.063726e-09 1.446211e-06
TRBC10.93727650.79500820.41661440.3783938 6.204811e-07 9.291523e-05
LRRN31.18168610.79191420.41578340.3761308 6.839808e-07 1.012331e-04
HIST1H1D0.90692610.79036100.41455030.3758107 6.496574e-07 9.671537e-05
RIPOR21.62317810.76668580.36193180.4047540 1.183178e-09 2.247774e-07
TUBB1.17958130.75703460.41589980.3411349 5.960336e-06 7.624737e-04
In [11]:
hvgs <- getTopHVGs(gene_var,fdr.threshold = 0.05)
length(hvgs)
337
In [12]:
hvgs <- hvgs[!grepl("^TR[AB][VJC]", hvgs)]
length(hvgs)
240
In [13]:
seurat_obj <- seurat_obj %>%
  ScaleData(features=hvgs) %>% 
  RunPCA(features=hvgs)
Centering and scaling data matrix

PC_ 1 
Positive:  LTB, IL7R, FTL, PASK, EEF1B2, RIPOR2, CCR7, SELL, TCF7, NELL2 
	   MTRNR2L12, CD52, AL138963.4, TMSB10, LRRN3, MALAT1, S100B, S100A4, KRT1, RPLP0 
	   KIF5C, HPGDS, KLRB1, LINC02315, TRDV3, LINC02109, KIR2DL3, FCRL3, TRGV11, AC005699.1 
Negative:  UBE2C, TOP2A, ASPM, KIFC1, CDK1, RRM2, HJURP, CKAP2L, DLGAP5, KNL1 
	   GTSE1, BIRC5, KIF15, CDCA8, NUSAP1, TPX2, HMMR, HIST1H3B, CENPF, MKI67 
	   CCNB2, KIF14, STMN1, PLK1, HIST1H1B, CDC20, HIST1H3C, PCLAF, TUBA1B, HIST1H3G 
PC_ 2 
Positive:  EEF1B2, IL7R, RPLP0, SELL, CCR7, TCF7, IFITM1, RIPOR2, PASK, LTB 
	   LRRN3, AL138963.4, IFITM2, KRT1, TMSB10, PDE3B, TUBA1B, RRM2, HMMR, KIFC1 
	   HIST1H3B, UBE2C, HJURP, HIST1H4C, CDK1, GTSE1, HIST1H3G, CDCA8, TUBB, PLK1 
Negative:  CST7, NKG7, GNLY, CCL4, ALOX5AP, HLA-DRB1, GZMH, CCL5, ZNF683, GZMB 
	   CD2, RCBTB2, CXCR3, CD74, HLA-DRA, LINC02694, HLA-DQA1, JAML, GZMK, CCL4L2 
	   AOAH, LSP1, PPP1R14B, CXCR6, AC013652.1, KLRK1, IL32, CTSW, GZMA, HOPX 
PC_ 3 
Positive:  PDE7B, KRT1, HPGDS, LINC02694, RTKN2, AC013652.1, GZMH, MALAT1, CST7, PPP1R14B 
	   ALOX5AP, PDE3B, CCL4, ACTG1, GAPDH, PLEK, CCL4L2, ACTB, LSP1, CYP1B1 
	   CCL3, HLA-DQA1, KLRB1, FOXP3, RCBTB2, GZMB, TRDC, HLA-DRA, HIST1H1C, HIST1H2AG 
Negative:  HOPX, CTSW, KLRC4, CCL5, KLRK1, LTB, CD52, NELL2, IFITM2, CD8B 
	   RIPOR2, LINC02446, SPINK2, SELL, XCL1, IFITM1, CXCR3, PASK, CCR7, TMSB10 
	   S100A4, CD7, AOAH, GZMA, ZNF683, S100B, LRRN3, NKG7, LGALS1, TCF7 
PC_ 4 
Positive:  IL32, CD7, SPINK2, XCL1, IFITM2, GAPDH, ACTG1, LSP1, FTL, JAML 
	   IFITM1, ACTB, LGALS1, CXCR6, KRT1, KRT86, HPGDS, S100A4, HIST1H1C, PDE7B 
	   RPLP0, RTKN2, KLRB1, CYP1B1, HMGN2, FOXP3, HOPX, CD2, HIST1H1E, PDE3B 
Negative:  RIPOR2, TMSB10, SELL, GZMH, GZMK, GZMA, CCL4, PLEK, PPP1R14B, HLA-DRB1 
	   CCL4L2, CCR7, HLA-DRA, HLA-DQA1, PASK, CCL3, GZMB, CD52, CD8B, LINC02694 
	   GNLY, LTB, AC013652.1, MALAT1, IL7R, KLRC1, MT-ND6, CD74, NKG7, KLRK1 
PC_ 5 
Positive:  CDC20, CCNB1, LGALS1, CCNB2, PTTG1, PLK1, TMSB10, CD52, HMMR, KIF14 
	   S100A4, TRDC, DLGAP5, BIRC5, CRIP1, CENPE, IFITM2, ASPM, GTSE1, RTKN2 
	   IFITM1, TPX2, LTB, CDCA8, CENPF, PDE7B, GZMA, TYROBP, UBE2C, KLRD1 
Negative:  HIST1H1E, HIST1H1C, HIST1H1D, HIST1H2AL, HIST1H2AI, HIST1H2AH, HIST1H3C, HIST1H2AB, HIST1H3D, HIST2H2AB 
	   HIST1H2AG, HIST1H3F, NELL2, HIST1H1B, HIST1H3B, AL138963.4, CD8B, KIF5C, HIST1H3G, AOAH 
	   MT-ND6, MALAT1, LRRN3, KLRC4, PDE3B, RRM2, TCF7, HIST1H4C, SPINK2, CCR7 

In [14]:
options(repr.plot.width = 8, repr.plot.height = 6)
ElbowPlot(seurat_obj, n=50)
In [15]:
options(repr.plot.width = 14, repr.plot.height = 6)
FeaturePlot(object = seurat_obj, reduction = "pca",
        features = c("nCount_RNA","nFeature_RNA"), order=T)
In [16]:
options(repr.plot.width = 16, repr.plot.height = 16, warn=-1,verbose = FALSE)
genes = c("CD3E", "CD3D",
          "CD4", "CD8A","CD8B","FOXP3",
          "TOP2A", "IL7R","SELL")

FeaturePlot(seurat_obj, reduction = "pca", 
            feature=genes, order = F, ncol=3)
In [17]:
options(repr.plot.width = 10, repr.plot.height = 6, warn=-1,verbose = FALSE)
VizDimLoadings(seurat_obj, dims = 1:2, reduction = "pca")

UMAP representation¶

In [18]:
seurat_obj <- RunUMAP(
  seurat_obj,
  dims = dcomp,
  reduction = "pca",
  reduction.name = "umap",
  reduction.key = "UMAP_"
)
15:56:45 UMAP embedding parameters a = 0.9922 b = 1.112

15:56:45 Read 9024 rows and found 12 numeric columns

15:56:45 Using Annoy for neighbor search, n_neighbors = 30

15:56:45 Building Annoy index with metric = cosine, n_trees = 50

0%   10   20   30   40   50   60   70   80   90   100%

[----|----|----|----|----|----|----|----|----|----|

*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
|

15:56:46 Writing NN index file to temp file /scratch_tmp/33937423/RtmpozxZ8i/file3ef0b3b066ef0

15:56:46 Searching Annoy index using 1 thread, search_k = 3000

15:56:49 Annoy recall = 100%

15:56:50 Commencing smooth kNN distance calibration using 1 thread
 with target n_neighbors = 30

15:56:51 Initializing from normalized Laplacian + noise (using irlba)

15:56:51 Commencing optimization for 500 epochs, with 372032 positive edges

15:57:16 Optimization finished

In [19]:
options(repr.plot.width = 8, repr.plot.height = 6, warn=-1,verbose = FALSE)
DimPlot(
  seurat_obj,
  reduction = "umap",
  pt.size = 0.1
) + ggtitle('UMAP') + 
    theme(plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))

More quality checks on UMAPs¶

Compute Cell-Cycle Scores¶

In [20]:
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
seurat_obj <- CellCycleScoring(seurat_obj, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)

Plot several features¶

In [21]:
cat_vars <-c("Phase")
con_vars <- c("nCount_RNA", "nFeature_RNA", "pct_mt", "percent.ribo", 
              "doublet_score", "PTPRC", "CD4", "CD8A", "CD8B", "TOP2A")
vars <- c(cat_vars, con_vars)
In [22]:
# compute plots
list_plots <- lapply(vars, function(var){
  if (var %in% cat_vars) {
      p <- DimPlot(seurat_obj, reduction = "umap", group.by=var)
  } else {
      p <- FeaturePlot(seurat_obj, reduction = "umap", feature=var, order = TRUE)
  }
  return(p)
})
In [23]:
options(repr.plot.width = 16, repr.plot.height = 12, warn=-1,verbose = FALSE)
# show plots
cp <- cowplot::plot_grid(plotlist = list_plots,
                   align = "hv",
                   axis = "trbl",
                   ncol = 4,
                   nrow = 3)
cp
In [24]:
options(repr.plot.width = 15, repr.plot.height = 22, warn=-1,verbose = FALSE)
genes = c("PTPRC","SMARCB1","TOP2A", "MKI67","CD3E", "CD3D",
           "CD4", "CD8A","CD8B", "TRDC","TRGV1","TRGV2", "FOXP3")

FeaturePlot(seurat_obj, reduction = "umap", 
            feature=genes, order = T, ncol=3)

Save¶

In [25]:
saveRDS(seurat_obj, here::here(glue::glue("{clust}/{robj_dir}/dimred_combined_object_{sample}.rds")))

Session Info¶

In [26]:
sessionInfo()
R version 4.1.1 (2021-08-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /home/groups/singlecell/isentis/conda_envs/ines_r4.1.1c/lib/libopenblasp-r0.3.24.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=es_ES.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=es_ES.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    grid      stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] scran_1.22.1                scater_1.22.0              
 [3] scuttle_1.4.0               SingleCellExperiment_1.16.0
 [5] SummarizedExperiment_1.24.0 Biobase_2.54.0             
 [7] GenomicRanges_1.46.1        GenomeInfoDb_1.30.1        
 [9] IRanges_2.28.0              S4Vectors_0.32.4           
[11] BiocGenerics_0.40.0         MatrixGenerics_1.6.0       
[13] matrixStats_1.0.0           gridExtra_2.3              
[15] lubridate_1.9.3             forcats_1.0.0              
[17] stringr_1.5.0               dplyr_1.1.3                
[19] purrr_1.0.2                 readr_2.1.4                
[21] tidyr_1.3.0                 tibble_3.2.1               
[23] ggplot2_3.4.4               tidyverse_2.0.0            
[25] SeuratObject_4.1.4          Seurat_4.0.5               

loaded via a namespace (and not attached):
  [1] uuid_1.1-1                plyr_1.8.9               
  [3] igraph_1.3.4              repr_1.1.6               
  [5] lazyeval_0.2.2            sp_2.1-0                 
  [7] splines_4.1.1             BiocParallel_1.28.3      
  [9] listenv_0.9.0             scattermore_1.2          
 [11] digest_0.6.33             htmltools_0.5.6.1        
 [13] viridis_0.6.4             fansi_1.0.5              
 [15] magrittr_2.0.3            ScaledMatrix_1.2.0       
 [17] tensor_1.5                cluster_2.1.4            
 [19] ROCR_1.0-11               limma_3.50.3             
 [21] tzdb_0.4.0                globals_0.16.2           
 [23] timechange_0.2.0          spatstat.sparse_3.0-1    
 [25] colorspace_2.1-0          ggrepel_0.9.4            
 [27] crayon_1.5.2              RCurl_1.98-1.12          
 [29] jsonlite_1.8.7            progressr_0.14.0         
 [31] spatstat.data_3.0-1       survival_3.5-7           
 [33] zoo_1.8-12                glue_1.6.2               
 [35] polyclip_1.10-4           gtable_0.3.4             
 [37] zlibbioc_1.40.0           XVector_0.34.0           
 [39] leiden_0.4.3              DelayedArray_0.20.0      
 [41] BiocSingular_1.10.0       future.apply_1.11.0      
 [43] abind_1.4-5               scales_1.2.1             
 [45] edgeR_3.36.0              spatstat.random_3.1-5    
 [47] miniUI_0.1.1.1            Rcpp_1.0.10              
 [49] viridisLite_0.4.2         xtable_1.8-4             
 [51] dqrng_0.3.1               reticulate_1.34.0        
 [53] spatstat.core_2.4-2       rsvd_1.0.5               
 [55] metapod_1.2.0             htmlwidgets_1.6.2        
 [57] httr_1.4.7                RColorBrewer_1.1-3       
 [59] ellipsis_0.3.2            ica_1.0-3                
 [61] farver_2.1.1              pkgconfig_2.0.3          
 [63] uwot_0.1.16               deldir_1.0-9             
 [65] here_1.0.1                locfit_1.5-9.8           
 [67] utf8_1.2.3                labeling_0.4.3           
 [69] tidyselect_1.2.0          rlang_1.1.1              
 [71] reshape2_1.4.4            later_1.3.1              
 [73] munsell_0.5.0             tools_4.1.1              
 [75] cli_3.6.1                 generics_0.1.3           
 [77] ggridges_0.5.4            evaluate_0.22            
 [79] fastmap_1.1.1             goftest_1.2-3            
 [81] fitdistrplus_1.1-11       RANN_2.6.1               
 [83] sparseMatrixStats_1.6.0   pbapply_1.7-2            
 [85] future_1.33.0             nlme_3.1-162             
 [87] mime_0.12                 compiler_4.1.1           
 [89] beeswarm_0.4.0            plotly_4.10.2            
 [91] png_0.1-8                 spatstat.utils_3.0-3     
 [93] statmod_1.5.0             stringi_1.7.12           
 [95] bluster_1.4.0             lattice_0.21-8           
 [97] IRdisplay_1.1             Matrix_1.6-1.1           
 [99] vctrs_0.6.4               pillar_1.9.0             
[101] lifecycle_1.0.3           spatstat.geom_3.2-5      
[103] lmtest_0.9-40             BiocNeighbors_1.12.0     
[105] RcppAnnoy_0.0.21          data.table_1.14.8        
[107] cowplot_1.1.1             bitops_1.0-7             
[109] irlba_2.3.5.1             httpuv_1.6.11            
[111] patchwork_1.1.3           R6_2.5.1                 
[113] promises_1.2.1            KernSmooth_2.23-22       
[115] vipor_0.4.7               parallelly_1.36.0        
[117] codetools_0.2-19          MASS_7.3-60              
[119] rprojroot_2.0.3           withr_2.5.1              
[121] sctransform_0.4.0         GenomeInfoDbData_1.2.7   
[123] mgcv_1.8-42               parallel_4.1.1           
[125] hms_1.1.3                 beachmat_2.10.0          
[127] rpart_4.1.19              IRkernel_1.3.2.9000      
[129] DelayedMatrixStats_1.16.0 Cairo_1.6-2              
[131] Rtsne_0.16                pbdZMQ_0.3-10            
[133] shiny_1.7.5.1             base64enc_0.1-3          
[135] ggbeeswarm_0.7.2